knitr::opts_chunk$set(echo = TRUE)
# Set working directory
setwd("C:\\Users\\Ran\\OneDrive\\cabi-trip-history-data")
# Remove exisiting data
rm(list=ls(all=TRUE))
# Load libraries
library(ggplot2)
library(lubridate)
##
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
##
## date
library(scales)
library(plyr)
##
## Attaching package: 'plyr'
## The following object is masked from 'package:lubridate':
##
## here
library(readr)
##
## Attaching package: 'readr'
## The following object is masked from 'package:scales':
##
## col_factor
library(ggmap)
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggmap':
##
## wind
## The following objects are masked from 'package:plyr':
##
## arrange, mutate, rename, summarise
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
Link for Capital Bike Share program in Washington DC https://s3.amazonaws.com/capitalbikeshare-data/index.html
# Public data for Capital Bike Share program in Washington DC
load("FinalDataNew.rdata")
FinalDataNew <- FinalData
head(FinalDataNew)
## TotalDurationMs StartDate EndDate
## 117692 489000 2010-09-15 14:05:00 2010-09-15 14:13:00
## 117691 970000 2010-09-16 10:11:00 2010-09-16 10:27:00
## 117690 14000 2010-09-17 11:00:00 2010-09-17 11:00:00
## 117689 76000 2010-09-17 11:04:00 2010-09-17 11:05:00
## 117688 9000 2010-09-17 11:05:00 2010-09-17 11:06:00
## 117686 7000 2010-09-17 11:19:00 2010-09-17 11:19:00
## StartStationNumber StartStation EndStationNumber
## 117692 31009 27th & Crystal Dr 31009
## 117691 31500 4th & Adams St NE 31500
## 117690 31101 14th & V St NW 31101
## 117689 31101 14th & V St NW 31101
## 117688 31101 14th & V St NW 31101
## 117686 31101 14th & V St NW 31101
## EndStation BikeNumber AccountType
## 117692 27th & Crystal Dr W00294 Casual
## 117691 4th & Adams St NE W00377 Casual
## 117690 14th & V St NW W00824 Registered
## 117689 14th & V St NW W00824 Registered
## 117688 14th & V St NW W00824 Registered
## 117686 14th & V St NW W00824 Registered
dim(FinalDataNew) # the length of the data is 13,623,103
## [1] 16108666 9
ggplot(SummaryDateAndTime, aes(x = hour, y = N, colour = weekday)) +
geom_point(data = weekday_summary, aes(group = weekday)) +
geom_line(data = weekday_summary, aes(group = weekday)) +
scale_x_discrete("hour") +
scale_y_continuous("N") +
theme_minimal() +
ggtitle("Different demands in weekday and weekend.\n") +
theme(plot.title = element_text(hjust = 0.5)) +
theme(plot.title=element_text(size=18))
# Create a dataframe with StartStationNumber and Weekday
StartStationinWeek.df <- data.frame(FinalDataNew$StartStationNumber, SummaryDateAndTime$weekday)
colnames(StartStationinWeek.df) <- c('StartStationNumber', 'SummaryWeekday')
head(StartStationinWeek.df)
## StartStationNumber SummaryWeekday
## 1 31009 Wednesday
## 2 31500 Thursday
## 3 31101 Friday
## 4 31101 Friday
## 5 31101 Friday
## 6 31101 Friday
weekends <- c('Saturday', 'Sunday') #define Sat and Sun as weekend
# If the day is Saturday or Sunday, labels the day as "weekend", other wise labels it as "weekday"
StartStationinWeek.df$SummaryWeekday <- factor((StartStationinWeek.df$SummaryWeekday %in% weekends),
levels=c(TRUE, FALSE), labels=c('weekend', 'weekday'))
colnames(StartStationinWeek.df) <- c('StartStationNumber', 'WeekdayOrWeekend')
head(StartStationinWeek.df)
## StartStationNumber WeekdayOrWeekend
## 1 31009 weekday
## 2 31500 weekday
## 3 31101 weekday
## 4 31101 weekday
## 5 31101 weekday
## 6 31101 weekday
library(plyr)
SummaryStartStationinWeek.df <- count(StartStationinWeek.df, c('StartStationNumber', 'WeekdayOrWeekend'))
head(SummaryStartStationinWeek.df)
## StartStationNumber WeekdayOrWeekend freq
## 1 31000 weekend 2607
## 2 31000 weekday 6765
## 3 31001 weekend 6751
## 4 31001 weekday 15738
## 5 31002 weekend 8331
## 6 31002 weekday 20980
SummaryStationWeekday.df <- subset(SummaryStartStationinWeek.df[c(1,3)],
SummaryStartStationinWeek.df$WeekdayOrWeekend == 'weekday' &
!is.na(SummaryStartStationinWeek.df$StartStationNumber))
SummaryStationWeekend.df <- subset(SummaryStartStationinWeek.df[c(1,3)],
SummaryStartStationinWeek.df$WeekdayOrWeekend == 'weekend' &
!is.na(SummaryStartStationinWeek.df$StartStationNumber))
# Scale the freq in weekday and weekend
SummaryStationWeekday.df$freq <- SummaryStationWeekday.df$freq/5
SummaryStationWeekend.df$freq <- SummaryStationWeekend.df$freq/2
# Sort the Station by freq (from high to low)
SummaryStationWeekday.df <- SummaryStationWeekday.df[order(-SummaryStationWeekday.df$freq),]
SummaryStationWeekend.df <- SummaryStationWeekend.df[order(-SummaryStationWeekend.df$freq),]
# Find the top 5 rental station in weekday
Top5StationInWeekday.df <- SummaryStationWeekday.df[1:5,]
TopStationInWeekday.df <- cbind(Top5StationInWeekday.df[1], 'Weekday')
colnames(TopStationInWeekday.df)[2] <- 'Week'
# Find the top 5 rental station in weekend
Top5StationInWeekend.df <- SummaryStationWeekend.df[1:5,]
TopStationInWeekend.df <- cbind(Top5StationInWeekend.df[1], 'Weekend')
colnames(TopStationInWeekend.df)[2] <- 'Week'
# Merge the top 5 rental staions in weekday and weekend
TopStationInWeek.df <- merge(TopStationInWeekday.df,TopStationInWeekend.df, by = 'StartStationNumber', all = TRUE)
TopStationInWeek.df$Week.x <- as.character(TopStationInWeek.df$Week.x)
# If the station in included in both weekday and weekend stations label it as top stations all week
TopStationInWeek.df$Week.x[!is.na(TopStationInWeek.df$Week.x) & !is.na(TopStationInWeek.df$Week.y)] <- 'AllWeek'
TopStationInWeek.df$Week.x[is.na(TopStationInWeek.df$Week.x)] <- 'Weekend'
TopStationInWeek.df <- TopStationInWeek.df[1:2]
colnames(TopStationInWeek.df)[2] <- 'TopRentalStations'
TopStationInWeek.df
## StartStationNumber TopRentalStations
## 1 31101 Weekend
## 2 31200 AllWeek
## 3 31201 AllWeek
## 4 31229 Weekday
## 5 31623 Weekday
## 6 31241 Weekday
## 7 31247 Weekend
## 8 31258 Weekend
p <- DCMap + geom_point(data=TopStationInWeek.df2, aes(x = long, y = lat, color = TopRentalStations), size=5, alpha=1)
p
# rm(list=ls(all=TRUE)) # remove previous data
load("DemandPredictionDataSet.rdata") # load dataframe for predictive model
head(DemandPredictionDataSet)
## StartStationNumber AccountType year month day weekday hour minute
## 117692 31009 Casual 2010 9 15 Wednesday 14 5
## 117691 31500 Casual 2010 9 16 Thursday 10 11
## 117690 31101 Registered 2010 9 17 Friday 11 0
## 117689 31101 Registered 2010 9 17 Friday 11 4
## 117688 31101 Registered 2010 9 17 Friday 11 5
## 117686 31101 Registered 2010 9 17 Friday 11 19
## EndStationNumber end_year end_month end_day end_weekday end_hour
## 117692 31009 2010 9 15 Wednesday 14
## 117691 31500 2010 9 16 Thursday 10
## 117690 31101 2010 9 17 Friday 11
## 117689 31101 2010 9 17 Friday 11
## 117688 31101 2010 9 17 Friday 11
## 117686 31101 2010 9 17 Friday 11
#split the traning dataset and testing dataset by 80:20
library(caTools)
set.seed(123)
split = sample.split(DemandPredictionDataSet$StartStationNumber, SplitRatio = 0.6)
training_set = subset(DemandPredictionDataSet, split == TRUE)
test_set = subset(DemandPredictionDataSet, split == FALSE)
# Count rental demand in each hour by start station
HourlyDemandPrediction.df <- count(training_set,c(1,3,4,5,7))
# Count rental demand in each hour by end station
HourlyDemandPrediction.end.df <- count(training_set,c(9,10,11,13,14))
TopStations <- count(training_set,c('StartStationNumber'))
TopStations$StartStationNumber <- as.numeric(as.character(TopStations$StartStationNumber))
# Sort the stations by frequency
TopStations <- TopStations[order(-TopStations$freq), ]
# Get top 10 stations
Top10Stations <- head(TopStations$StartStationNumber,10)
Top10Stations
## [1] 31200 31623 31201 31258 31247 31229 31241 31101 31214 31613
# Top X station
X <- 8
StationX <- Top10Stations[X]
# Create a dataset for the rental demand in Station X by hour and weekday
# Get the data only for Station X
StationX.df <- subset(training_set, training_set$StartStationNumber == StationX)
# Count the rental demand in Station X by hour
StationX.summary <- ddply(StationX.df,.(month,weekday,hour),summarise,N = length(hour))
StationX.summary.detail <- ddply(StationX.df,.(year,month,day,weekday,hour),summarise,N = length(hour))
# Count the returned bike in Station X by hour
EndStationX.df <- subset(training_set, training_set$EndStationNumber == StationX)
EndStationX.summary.detail <- ddply(EndStationX.df,.(end_year,end_month,end_day,end_weekday,end_hour),
summarise,N = length(end_hour))
colnames(EndStationX.summary.detail)[1:5] <- c('year','month','day','weekday','hour')
# Count the rental demand in Station X by hour and weekday
StationX.DayAndHour <- aggregate(N ~ weekday + hour, data = StationX.summary, FUN = sum)
# Visualize the rental demand in Station X by hour and weekday
ggplot(StationX.DayAndHour, aes(x = as.factor(hour), y = N, colour = weekday)) +
geom_point(data = StationX.DayAndHour, aes(group = weekday)) +
geom_line(data = StationX.DayAndHour, aes(group = weekday)) +
scale_x_discrete("hour") +
scale_y_continuous("N") +
theme_minimal() +
ggtitle(sprintf("Rental Demand in Station %d", Top10Stations[X])) +
theme_minimal() +
theme(plot.title=element_text(size=18))+
theme(plot.title = element_text(hjust = 0.5))
library(plotly)
# Get the rental demand of station X on Mondays at each hour and each month
Demand <- xtabs(N ~ hour + month, data = subset(StationX.summary, StationX.summary$weekday == 'Monday'))
plot_ly(x = 1:12, z = Demand, type = "surface") %>%
layout(scene = list(title = "Rental Demand on Saturdays in Station 31623, x = month, y= hour, z1 = count", xaxis = list(title = "Month"), yaxis = list(title = "Hour"), zaxis = list(title = "Demand")))
# Stations that use 12th polynomial model
Model1Station <- c(31623, 31201, 31258, 31247, 31229, 31214, 31101, 31613)
# Stations that use 10th polynomial model
Model2Station <- c(31200, 31241)
# 12th polynomial model for weekday
weekdayPolyModel1 <- function(input_dataset){
dataset.out.weekday <- input_dataset
dataset.out.weekday$year_p2 <- dataset.out.weekday$year^2
dataset.out.weekday$month_p2 <- dataset.out.weekday$month^2
dataset.out.weekday$hour_p2 <- dataset.out.weekday$hour^2
dataset.out.weekday$hour_p3 <- dataset.out.weekday$hour^3
dataset.out.weekday$hour_p4 <- dataset.out.weekday$hour^4
dataset.out.weekday$hour_p5 <- dataset.out.weekday$hour^5
dataset.out.weekday$hour_p6 <- dataset.out.weekday$hour^6
dataset.out.weekday$hour_p7 <- dataset.out.weekday$hour^7
dataset.out.weekday$hour_p8 <- dataset.out.weekday$hour^8
dataset.out.weekday$hour_p9 <- dataset.out.weekday$hour^9
dataset.out.weekday$hour_p10 <- dataset.out.weekday$hour^10
dataset.out.weekday$hour_p11 <- dataset.out.weekday$hour^11
dataset.out.weekday$hour_p12 <- dataset.out.weekday$hour^12
# Fitting the polynomial regression
regressor.weekday <- lm(formula = out ~ year+year_p2+month+month_p2+hour+hour_p2+hour_p3+hour_p4
+hour_p5+hour_p6+hour_p7+hour_p8+hour_p9+hour_p10+hour_p11+hour_p12,
data = dataset.out.weekday)
# Function returns the regressor for weekday
return(regressor.weekday)
}
# 10th polynomial model for weekday
weekdayPolyModel2 <- function(input_dataset){
dataset.out.weekday <- input_dataset
dataset.out.weekday$year_p2 <- dataset.out.weekday$year^2
dataset.out.weekday$month_p2 <- dataset.out.weekday$month^2
dataset.out.weekday$hour_p2 <- dataset.out.weekday$hour^2
dataset.out.weekday$hour_p3 <- dataset.out.weekday$hour^3
dataset.out.weekday$hour_p4 <- dataset.out.weekday$hour^4
dataset.out.weekday$hour_p5 <- dataset.out.weekday$hour^5
dataset.out.weekday$hour_p6 <- dataset.out.weekday$hour^6
dataset.out.weekday$hour_p7 <- dataset.out.weekday$hour^7
dataset.out.weekday$hour_p8 <- dataset.out.weekday$hour^8
dataset.out.weekday$hour_p9 <- dataset.out.weekday$hour^9
dataset.out.weekday$hour_p10 <- dataset.out.weekday$hour^10
# Fitting the polynomial regression
regressor.weekday <- lm(formula = out ~ year+year_p2+month+month_p2+hour+hour_p2+hour_p3+hour_p4+
hour_p5+hour_p6+hour_p7+hour_p8+hour_p9+hour_p10,
data = dataset.out.weekday)
return(regressor.weekday)
}
# 4th polynomial model for weekend
weekendPolyModel <- function(input_dataset){
dataset.out.weekday <- input_dataset
dataset.out.weekend$year_p2 <- dataset.out.weekend$year^2
dataset.out.weekend$month_p2 <- dataset.out.weekend$month^2
dataset.out.weekend$hour_p2 <- dataset.out.weekend$hour^2
dataset.out.weekend$hour_p3 <- dataset.out.weekend$hour^3
dataset.out.weekend$hour_p4 <- dataset.out.weekend$hour^4
# Fitting the polynomial regression
regressor.weekend <- lm(formula = out ~ year+year_p2+month+month_p2+
hour+hour_p2+hour_p3+hour_p4,
data = dataset.out.weekend)
# Function returns the regressor for weekday
return(regressor.weekend)
}
# Function to select the model for weekday prediction
predictStationXinWeekday <- function(SelectStation){
# If the station number is in Model1 then use Model1 (12th poly)
if (SelectStation %in% Model1Station){
RegressorWeekday = weekdayPolyModel1(dataset.out.weekday)
}
# If the station number is in Model2 then use Model2 (10th poly)
if (SelectStation %in% Model2Station){
RegressorWeekday = weekdayPolyModel2(dataset.out.weekday)
}
return(RegressorWeekday)
}
# Function to use model (4th poly) for weekend prediction
predictStationXinWeekend <- function(SelectStation){
RegressorWeekend = weekendPolyModel(dataset.out.weekend)
return(RegressorWeekend)
}
# Function to create the predicted dataset in weekday
predictedWeekdayDemand <- function(SelectStation,Year,Month,Hour){
# Select the model 1 (12th poly)
if (SelectStation %in% Model1Station){
modeldata <- data.frame(year = Year, year_p2 = Year^2, month = Month, month_p2 = Month^2,
hour = Hour, hour_p2 = Hour^2, hour_p3 = Hour^3, hour_p4 = Hour^4,
hour_p5 = Hour^5, hour_p6 = Hour^6, hour_p7 = Hour^7, hour_p8 = Hour^8,
hour_p9 = Hour^9, hour_p10 = Hour^10, hour_p11 = Hour^11, hour_p12 = Hour^12)
}
# Select the model 2 (10th poly)
if (SelectStation %in% Model2Station){
modeldata <- data.frame(year = Year, year_p2 = Year^2, month = Month, month_p2 = Month^2,
hour = Hour, hour_p2 = Hour^2, hour_p3 = Hour^3, hour_p4 = Hour^4,
hour_p5 = Hour^5, hour_p6 = Hour^6, hour_p7 = Hour^7, hour_p8 = Hour^8,
hour_p9 = Hour^9, hour_p10 = Hour^10)
}
# Create the predicted dataset in weekday
predictedWeekdayData = cbind.data.frame(hour = Hour,
demand = predict(myPolyModelWeekday,newdata = modeldata))
# Return the predicted dataset in weekday
return(predictedWeekdayData)
}
# Function to create the predicted dataset in weekend
predictedWeekendDemand <- function(SelectStation,Year,Month,Hour){
# Use 4th poly model
modeldata <- data.frame(year = Year, year_p2 = Year^2, month = Month, month_p2 = Month^2,
hour = Hour, hour_p2 = Hour^2, hour_p3 = Hour^3, hour_p4 = Hour^4)
# Create the predicted dataset in weekend
predictedWeekendData = cbind.data.frame(hour = Hour,
demand = predict(myPolyModelWeekend,newdata = modeldata))
# Return the predicted dataset in weekend
return(predictedWeekendData)
}
# Select station X
X <- 1
StationX <- Top10Stations[X]
# Create a dataset for the rental demand in Station X by hour and weekday
# Count the rental demand in Station X by hour
StationX.df <- subset(training_set, training_set$StartStationNumber == StationX)
StationX.test.df <- subset(test_set, test_set$StartStationNumber == StationX)
# Count the rental demand in Station X by hour
StationX.summary <- ddply(StationX.df,.(month,weekday,hour),summarise,N = length(hour))
StationX.summary.detail <- ddply(StationX.df,.(year,month,day,weekday,hour),summarise,N = length(hour))
StationX.test.summary <- ddply(StationX.test.df,.(month,weekday,hour),summarise,N = length(hour))
StationX.test.summary.detail <- ddply(StationX.test.df,.(year,month,day,weekday,hour),summarise,N = length(hour))
# Count the returned bike in Station X by hour
EndStationX.df <- subset(training_set, training_set$EndStationNumber == StationX)
EndStationX.summary.detail <- ddply(EndStationX.df,.(end_year,end_month,end_day,end_weekday,end_hour),
summarise,N = length(end_hour))
colnames(EndStationX.summary.detail)[1:5] <- c('year','month','day','weekday','hour')
StationX.demand.detail <- merge.data.frame(StationX.summary.detail, EndStationX.summary.detail,
by = c('year','month','day','weekday','hour'),
all = TRUE)
StationX.demand.detail[is.na(StationX.demand.detail)] <- 0
colnames(StationX.demand.detail)[6:7] <- c('out','In')
StationX.demand.detail$netout <- StationX.demand.detail$out - StationX.demand.detail$In
dataset.out <- StationX.demand.detail[c(1:5,6)]
#####
EndStationX.test.df <- subset(test_set, test_set$EndStationNumber == StationX)
EndStationX.test.summary.detail <- ddply(EndStationX.test.df,.(end_year,end_month,end_day,end_weekday,end_hour),
summarise,N = length(end_hour))
colnames(EndStationX.test.summary.detail)[1:5] <- c('year','month','day','weekday','hour')
StationX.test.demand.detail <- merge.data.frame(StationX.test.summary.detail, EndStationX.test.summary.detail,
by = c('year','month','day','weekday','hour'),
all = TRUE)
StationX.test.demand.detail[is.na(StationX.test.demand.detail)] <- 0
colnames(StationX.test.demand.detail)[6:7] <- c('out','In')
StationX.test.demand.detail$netout <- StationX.test.demand.detail$out - StationX.test.demand.detail$In
dataset.test.out <- StationX.test.demand.detail[c(1:5,6)]
# Create the dataset for weekday and weekend
weekend.position <- dataset.out$weekday == 'Saturday' | dataset.out$weekday == 'Sunday'
dataset.out.weekday <- subset(dataset.out, weekend.position == FALSE)
dataset.out.weekend <- subset(dataset.out, weekend.position == TRUE)
weekend.test.position <- dataset.test.out$weekday == 'Saturday' | dataset.test.out$weekday == 'Sunday'
dataset.test.out.weekday <- subset(dataset.test.out, weekend.test.position == FALSE)
dataset.test.out.weekend <- subset(dataset.test.out, weekend.test.position == TRUE)
PredictYear <- 2015
PredictMonth <- 6
hour_grid <- seq(0, 23, 0.1)
# Predict weekday rental demand in Station X
myPolyModelWeekday <- predictStationXinWeekday(StationX)
summary(myPolyModelWeekday)
##
## Call:
## lm(formula = out ~ year + year_p2 + month + month_p2 + hour +
## hour_p2 + hour_p3 + hour_p4 + hour_p5 + hour_p6 + hour_p7 +
## hour_p8 + hour_p9 + hour_p10, data = dataset.out.weekday)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10.8505 -2.0335 -0.3559 1.6109 21.0966
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -7.561e+05 2.137e+04 -35.386 < 2e-16 ***
## year 7.510e+02 2.123e+01 35.381 < 2e-16 ***
## year_p2 -1.865e-01 5.271e-03 -35.376 < 2e-16 ***
## month 1.454e+00 2.292e-02 63.458 < 2e-16 ***
## month_p2 -1.033e-01 1.713e-03 -60.273 < 2e-16 ***
## hour -1.335e+00 5.278e-01 -2.530 0.011416 *
## hour_p2 2.361e+00 6.449e-01 3.660 0.000252 ***
## hour_p3 -1.684e+00 3.102e-01 -5.430 5.67e-08 ***
## hour_p4 5.179e-01 7.815e-02 6.628 3.46e-11 ***
## hour_p5 -8.126e-02 1.163e-02 -6.986 2.88e-12 ***
## hour_p6 7.178e-03 1.078e-03 6.661 2.76e-11 ***
## hour_p7 -3.698e-04 6.283e-05 -5.886 4.00e-09 ***
## hour_p8 1.090e-05 2.242e-06 4.863 1.16e-06 ***
## hour_p9 -1.672e-07 4.471e-08 -3.741 0.000184 ***
## hour_p10 9.989e-10 3.817e-10 2.617 0.008870 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.214 on 32594 degrees of freedom
## Multiple R-squared: 0.4326, Adjusted R-squared: 0.4324
## F-statistic: 1775 on 14 and 32594 DF, p-value: < 2.2e-16
# Get the predicted demand data
predictedWeekdayData = predictedWeekdayDemand(StationX,PredictYear,PredictMonth,hour_grid)
# Get the actual deamnd data in the month and year
PointToPlot1 <- dataset.out.weekday$year == PredictYear & dataset.out.weekday$month == PredictMonth
plotData <- dataset.out.weekday[PointToPlot1, c(5,6)]
# Plot the boxplot of the actual data and lineplot of the predicted data
plotWeekday <- ggplot()+
geom_boxplot(data = plotData,aes(x=hour,y =out, group = hour))+
geom_line(data = predictedWeekdayData, aes(x=hour, y=demand), colour = 'blue',group = 1) +
ggtitle(paste("Training set: Weekday Rental Demand at Station ",StationX, " in ",PredictYear, "-", PredictMonth, sep = "")) +
xlab('Hour') +
ylab('Demand')+
theme(plot.title = element_text(hjust = 0.5))
plotWeekday
#Predict weekend rental demand in Station X
myPolyModelWeekend <- predictStationXinWeekend(StationX)
summary(myPolyModelWeekend)
##
## Call:
## lm(formula = out ~ year + year_p2 + month + month_p2 + hour +
## hour_p2 + hour_p3 + hour_p4, data = dataset.out.weekend)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.5719 -2.2630 -0.4089 1.7136 25.5815
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -7.272e+05 3.549e+04 -20.49 <2e-16 ***
## year 7.224e+02 3.525e+01 20.49 <2e-16 ***
## year_p2 -1.794e-01 8.754e-03 -20.49 <2e-16 ***
## month 1.748e+00 3.754e-02 46.57 <2e-16 ***
## month_p2 -1.287e-01 2.808e-03 -45.82 <2e-16 ***
## hour -3.147e+00 8.027e-02 -39.21 <2e-16 ***
## hour_p2 6.606e-01 1.457e-02 45.33 <2e-16 ***
## hour_p3 -4.016e-02 9.398e-04 -42.73 <2e-16 ***
## hour_p4 7.508e-04 1.986e-05 37.81 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.35 on 13417 degrees of freedom
## Multiple R-squared: 0.425, Adjusted R-squared: 0.4246
## F-statistic: 1239 on 8 and 13417 DF, p-value: < 2.2e-16
# Get the predicted demand data
predictedWeekendData = predictedWeekendDemand(StationX,PredictYear,PredictMonth,hour_grid)
# Get the actual deamnd data in the month and year
PointToPlot1 <- dataset.out.weekend$year == PredictYear & dataset.out.weekend$month == PredictMonth
plotData <- dataset.out.weekend[PointToPlot1, c(5,6)]
# Plot the boxplot of the actual data and lineplot of the predicted data
plotWeekend <- ggplot()+
geom_boxplot(data = plotData,aes(x=hour,y =out, group = hour))+
geom_line(data = predictedWeekendData, aes(x=hour, y=demand), colour = 'blue',group = 1) +
ggtitle(paste("Training set: Weekend Rental Demand at Station ",StationX, " in ",PredictYear, "-", PredictMonth, sep = "")) +
xlab('Hour') +
ylab('Demand')+
theme(plot.title = element_text(hjust = 0.5))
plotWeekend
PredictYear <- 2015
PredictMonth <- 6
hour_grid <- seq(0, 23, 0.1)
# Predict weekday rental demand in Station X
myPolyModelWeekday <- predictStationXinWeekday(StationX)
summary(myPolyModelWeekday)
##
## Call:
## lm(formula = out ~ year + year_p2 + month + month_p2 + hour +
## hour_p2 + hour_p3 + hour_p4 + hour_p5 + hour_p6 + hour_p7 +
## hour_p8 + hour_p9 + hour_p10, data = dataset.out.weekday)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10.8505 -2.0335 -0.3559 1.6109 21.0966
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -7.561e+05 2.137e+04 -35.386 < 2e-16 ***
## year 7.510e+02 2.123e+01 35.381 < 2e-16 ***
## year_p2 -1.865e-01 5.271e-03 -35.376 < 2e-16 ***
## month 1.454e+00 2.292e-02 63.458 < 2e-16 ***
## month_p2 -1.033e-01 1.713e-03 -60.273 < 2e-16 ***
## hour -1.335e+00 5.278e-01 -2.530 0.011416 *
## hour_p2 2.361e+00 6.449e-01 3.660 0.000252 ***
## hour_p3 -1.684e+00 3.102e-01 -5.430 5.67e-08 ***
## hour_p4 5.179e-01 7.815e-02 6.628 3.46e-11 ***
## hour_p5 -8.126e-02 1.163e-02 -6.986 2.88e-12 ***
## hour_p6 7.178e-03 1.078e-03 6.661 2.76e-11 ***
## hour_p7 -3.698e-04 6.283e-05 -5.886 4.00e-09 ***
## hour_p8 1.090e-05 2.242e-06 4.863 1.16e-06 ***
## hour_p9 -1.672e-07 4.471e-08 -3.741 0.000184 ***
## hour_p10 9.989e-10 3.817e-10 2.617 0.008870 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.214 on 32594 degrees of freedom
## Multiple R-squared: 0.4326, Adjusted R-squared: 0.4324
## F-statistic: 1775 on 14 and 32594 DF, p-value: < 2.2e-16
# Get the predicted demand data
predictedWeekdayData = predictedWeekdayDemand(StationX,PredictYear,PredictMonth,hour_grid)
# Get the actual deamnd data in the month and year
PointToPlot1 <- dataset.test.out.weekday$year == PredictYear & dataset.test.out.weekday$month == PredictMonth
plotData <- dataset.test.out.weekday[PointToPlot1, c(5,6)]
# Plot the boxplot of the actual data and lineplot of the predicted data
plotWeekday <- ggplot()+
geom_boxplot(data = plotData,aes(x=hour,y =out, group = hour))+
geom_line(data = predictedWeekdayData, aes(x=hour, y=demand), colour = 'blue',group = 1) +
ggtitle(paste("Testing set: Weekday Rental Demand at Station ",StationX, " in ",PredictYear, "-", PredictMonth, sep = "")) +
xlab('Hour') +
ylab('Demand')+
theme(plot.title = element_text(hjust = 0.5))
plotWeekday
#Predict weekend rental demand in Station X
myPolyModelWeekend <- predictStationXinWeekend(StationX)
summary(myPolyModelWeekend)
##
## Call:
## lm(formula = out ~ year + year_p2 + month + month_p2 + hour +
## hour_p2 + hour_p3 + hour_p4, data = dataset.out.weekend)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.5719 -2.2630 -0.4089 1.7136 25.5815
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -7.272e+05 3.549e+04 -20.49 <2e-16 ***
## year 7.224e+02 3.525e+01 20.49 <2e-16 ***
## year_p2 -1.794e-01 8.754e-03 -20.49 <2e-16 ***
## month 1.748e+00 3.754e-02 46.57 <2e-16 ***
## month_p2 -1.287e-01 2.808e-03 -45.82 <2e-16 ***
## hour -3.147e+00 8.027e-02 -39.21 <2e-16 ***
## hour_p2 6.606e-01 1.457e-02 45.33 <2e-16 ***
## hour_p3 -4.016e-02 9.398e-04 -42.73 <2e-16 ***
## hour_p4 7.508e-04 1.986e-05 37.81 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.35 on 13417 degrees of freedom
## Multiple R-squared: 0.425, Adjusted R-squared: 0.4246
## F-statistic: 1239 on 8 and 13417 DF, p-value: < 2.2e-16
# Get the predicted demand data
predictedWeekendData = predictedWeekendDemand(StationX,PredictYear,PredictMonth,hour_grid)
# Get the actual deamnd data in the month and year
PointToPlot1 <- dataset.test.out.weekend$year == PredictYear & dataset.test.out.weekend$month == PredictMonth
plotData <- dataset.test.out.weekend[PointToPlot1, c(5,6)]
# Plot the boxplot of the actual data and lineplot of the predicted data
plotWeekend <- ggplot()+
geom_boxplot(data = plotData,aes(x=hour,y =out, group = hour))+
geom_line(data = predictedWeekendData, aes(x=hour, y=demand), colour = 'blue',group = 1) +
ggtitle(paste("Testing set: Weekend Rental Demand at Station ",StationX, " in ",PredictYear, "-", PredictMonth, sep = "")) +
xlab('Hour') +
ylab('Demand')+
theme(plot.title = element_text(hjust = 0.5))
plotWeekend